from lxml import etree
# XPath parsing works for both XML and HTML documents.
# Alternative import: from lxml import html

# Sample HTML document used to demonstrate XPath queries: an unordered list of
# external links, an ordered list of relative links, and two classed <div>s.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <ul>
        <li><a href="http://www.baidu.com">百度</a></li>
        <li><a href="http://www.googel.com">谷歌</a></li>
        <li><a href="http://www.soguo.com">搜狗</a></li>
    </ul>
    <ol>
        <li><a href="feiji">飞机</a></li>
        <li><a href="tanke">坦克</a></li>
        <li><a href="huoche">火车</a></li>
    </ol>
    <div class="job">李嘉诚</div>
    <div class="mht">马化腾</div>
</body>
</html>
"""

# Parse the markup; the result is the element that xpath() is called on below.
et = etree.HTML(html)
print(et)

# Positional predicates in XPath are 1-based, so li[2] is the second <li>
# of the <ul>. xpath() returns a list, hence the [0] to take the single hit.
second_link_texts = et.xpath("/html/body/ul/li[2]/a/text()")
print(second_link_texts[0])

# Print the href and the visible text of the link inside every <li>
# ("//li" matches the items of both the <ul> and the <ol>).
li_list = et.xpath("//li")
for li in li_list:
    # A leading "./" makes the query relative to the current <li> node.
    anchor_hrefs = li.xpath("./a/@href")
    anchor_texts = li.xpath("./a/text()")
    href, text = anchor_hrefs[0], anchor_texts[0]
    print(href, text)


# Filter by attribute value: text of the <div> whose class is "job".
job_div_texts = et.xpath('//body/div[@class="job"]/text()')
print(job_div_texts)